#Althoff IFN data processing

This analysis is to look at why Scribble KO HSC are insensitive to activating signals from interferon. To understand this WT and Scibble KO mice are treated or not with PolyIC an interferon stimulator. There are 4 groups here with WT, KO, WTIC, and KOIC. There are 3 mice with untreated condition and 4 mice per treated condition.

Pre-processing

  1. Adapters were trimmed with FASTP 0.21.0
  2. Gene expression was quantified using salmon v1.3.0
  3. TPMs were obtained for the genes using tximport 1.20.0
library(ggplot2)
library(DESeq2)
library(tximport)
library(readr)
library(tximportData)
library(readxl)
library(knitr)
library(tidyverse)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(ggrepel)
library(EnhancedVolcano)
library(fgsea)
library(limma)
library(VennDiagram)
library(UpSetR)
library(wesanderson)
library(kableExtra)
library(reshape)
library(dplyr)
library(msigdbr)

Load color palettes

Load metrics

## New names:
## * `` -> ...1
Summary of Data Metrics
samples reads before trim reads after trim %Q30 duplication % of reads with adapter Insert size percent aligned splices annotated salmon percent aligned
WT_rep1 61002551 60449791 92.4929 12.0915 9.561156 118 78.64 9198083 59.6994
WT_rep2 59366695 58759406 91.5568 11.6178 8.725623 119 79.87 10097467 62.0906
WT_rep3 70194654 69436669 92.0470 12.9095 8.320139 119 79.59 11066266 62.8137
KO_rep1 63782461 63126127 91.7028 12.4218 1.109101 119 72.87 8375861 51.9990
KO_rep2 66128187 65388713 91.3069 13.5671 8.849251 117 77.00 8346970 56.2829
KO_rep3 63880365 63168300 91.0102 13.0646 12.360554 119 71.95 8921300 56.4861
WTIC_rep1 41050763 40183636 90.1762 13.9266 3.044598 119 80.23 5500867 53.5136
WTIC_rep2 39474990 38752578 91.7329 12.6450 4.681941 119 81.57 5055002 52.9577
WTIC_rep3 38524980 37699529 89.4794 14.3038 3.207358 118 79.34 5343450 55.9974
WTIC_rep4 37942449 37173849 90.6850 12.4860 3.233808 119 80.16 4726578 52.9712
KOIC_rep1 41161158 40368325 91.3712 14.3709 3.844799 117 80.44 4962629 52.4011
KOIC_rep2 40345747 39535690 90.9779 13.8305 3.955074 119 80.05 5199790 53.6240
KOIC_rep3 43540976 42726703 91.7218 14.2954 3.428689 119 80.06 5860118 55.7803
KOIC_rep4 41799389 40956452 90.8947 14.8623 3.349685 119 80.46 4838976 52.2488
Metadata Table
Sample Condition
WT_rep1 WT
WT_rep2 WT
WT_rep3 WT
KO_rep1 KO
KO_rep2 KO
KO_rep3 KO
WTIC_rep1 WTIC
WTIC_rep2 WTIC
WTIC_rep3 WTIC
WTIC_rep4 WTIC
KOIC_rep1 KOIC
KOIC_rep2 KOIC
KOIC_rep3 KOIC
KOIC_rep4 KOIC

We see high correlation between all samples that were treated regardless of phenotype. In the untreated samples there was less correlation this may be due to the mouse variability.

PCA plot of non-normalized data

PC1 vs PC2

This doesn’t seem to match the correlation data which indicated that the treated samples were more similar to each other.

Run Differential Expression testing using DESeq2 and Calculate Gene Set Enrichment

Compare WTvKO, WTvWTIC, KOvKOIC, WTICvKOIC

sig = padj <0.01 and abs(l2fc) >0.5

## [1] TRUE
## [1] TRUE
## using just counts from tximport
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## [1] 4319
## [1] 1180
## [1] 31
## [1] 4329
kable(significant_genes)
comparison sig.genes
WTvsWTIC 4319
WTvKO 1180
WTICvKOIC 31
KOvKOIC 4329
DEG_WTvKO<-as.data.frame(res_WTvKO) %>%
  filter(padj<.01 & abs(log2FoldChange) >= .5)%>%
  rownames_to_column(var="ensembl_gene_id")%>%
  left_join(ens2gene,by="ensembl_gene_id")%>%
  select(Gene, padj, pvalue, log2FoldChange)%>%
  arrange(padj)


DEG_WTvWTIC<-as.data.frame(res_WTvWTIC)%>%
  filter(padj<.01 & abs(log2FoldChange) >= .5)%>%
  rownames_to_column(var="ensembl_gene_id")%>%
  left_join(ens2gene,by="ensembl_gene_id")%>%
  select(Gene, padj, pvalue, log2FoldChange)%>%
  arrange(padj)

DEG_WTICvKOIC<-as.data.frame(res_WTICvKOIC)%>%
  filter(padj<.01 & abs(log2FoldChange) >= .5)%>%
  rownames_to_column(var="ensembl_gene_id")%>%
  left_join(ens2gene,by="ensembl_gene_id")%>%
  select(Gene, padj, pvalue, log2FoldChange)%>%
  arrange(padj)

DEG_KOvKOIC<-as.data.frame(res_KOvKOIC)%>%
  filter(padj<.01 & abs(log2FoldChange) >= .5)%>%
  rownames_to_column(var="ensembl_gene_id")%>%
  left_join(ens2gene,by="ensembl_gene_id")%>%
  select(Gene, padj, pvalue, log2FoldChange)%>%
  arrange(padj)
plotDispEsts(dds.filtered)

# Volcano Plots

## Warning: Removed 6293 rows containing missing values (geom_point).

## Warning: Removed 4220 rows containing missing values (geom_point).

## Warning: Removed 4220 rows containing missing values (geom_point).

## Warning: Removed 4770 rows containing missing values (geom_point).

MA Plots

PCA analysis

## Warning: Unknown or uninitialised column: `Mouse_genotype`.

Top 20 DEG plots

top_20_WTvKO<-DEG_WTvKO[1:20, 1]
top_20_WTvWTIC<-DEG_WTvWTIC[1:20, 1]
top_20_KOvKOIC<-DEG_KOvKOIC[1:20, 1]
top_20_WTICvKOIC<-DEG_WTvWTIC[1:20, 1]

normalized_counts <- counts(dds.filtered, normalized=T)
normalized_counts<-as.data.frame(normalized_counts)
normalized_counts<-rownames_to_column(normalized_counts, var="ensembl_gene_id")
normalized_counts<-inner_join(normalized_counts, ens2gene, by="ensembl_gene_id")


top20_norm_counts_WTvKO <- filter(normalized_counts, Gene %in% top_20_WTvKO)%>%
  select(Gene, WT_rep1, WT_rep2, WT_rep3, KO_rep1, KO_rep2, KO_rep3)
## stopped here there is something wrong
melted_top20_norm_counts_WTvKO <- data.frame(melt(top20_norm_counts_WTvKO))
## Using Gene as id variables
colnames(melted_top20_norm_counts_WTvKO)<-c("Gene", "Sample", "normalized_counts")
melted_top20_norm_counts_WTvKO<-full_join(melted_top20_norm_counts_WTvKO,metadata, by="Sample")
melted_top20_norm_counts_WTvKO<-filter(melted_top20_norm_counts_WTvKO, Condition!=c("WTIC", "KOIC"))
ggplot(melted_top20_norm_counts_WTvKO) +
        geom_point(aes(x = Gene, y = normalized_counts, color = Condition)) +
        scale_y_log10() +
        xlab("Genes") +
        ylab("Normalized Counts") +
        ggtitle("Top 20 Significant DE Genes") +
        theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme(plot.title=element_text(hjust=0.5))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 4 rows containing missing values (geom_point).

kable(top20_norm_counts_WTvKO)
Gene WT_rep1 WT_rep2 WT_rep3 KO_rep1 KO_rep2 KO_rep3
Ldhc 0.00000 0.0000 0.0000 0.000000 0.000000 106.04029
Ttc16 0.00000 0.0000 167.7153 0.000000 0.000000 0.00000
Gm15429 0.00000 0.0000 0.0000 0.000000 427.015366 0.00000
Olfr248 0.00000 0.0000 0.0000 209.025545 0.000000 0.00000
Cyp4f39 0.00000 0.0000 0.0000 126.581207 0.000000 0.00000
Gm10221 0.00000 0.0000 0.0000 0.000000 0.000000 139.93841
Gm3086 0.00000 0.0000 0.0000 0.000000 0.000000 89.52581
Gm17022 86.44889 0.0000 0.0000 0.000000 0.000000 0.00000
Gm26571 0.00000 0.0000 0.0000 0.000000 234.898812 0.00000
H2-Pa 0.00000 0.0000 0.0000 85.775423 0.000000 0.00000
Gm37571 0.00000 0.0000 0.0000 0.000000 89.600578 0.00000
Gm38289 0.00000 0.0000 0.0000 0.000000 269.608946 0.00000
Gm10728 0.00000 0.0000 0.0000 83.277110 0.000000 0.00000
Gm19148 0.00000 0.0000 0.0000 0.000000 0.000000 217.29567
Gm43890 0.00000 0.0000 0.0000 0.000000 104.130401 0.00000
Gm45399 130.90833 142.9232 343.3146 5.829398 5.650487 0.00000
Gm49369 0.00000 0.0000 0.0000 213.189401 0.000000 0.00000
Gm45632 0.00000 0.0000 0.0000 94.103134 0.000000 0.00000
Gm47502 0.00000 0.0000 0.0000 0.000000 164.671332 0.00000
Gm49522 0.00000 0.0000 0.0000 0.000000 0.000000 95.61009
ens2gene <- unique(ens2gene)
#pulls the hallmark gene set for mouse
h_gene_sets_mouse = msigdbr(species = "mouse", category = "H")
mouse.hallmark.list = base::split(x = h_gene_sets_mouse$gene_symbol, f = h_gene_sets_mouse$gs_name)

#tidying data

WTvKO_gsea<-results(dds.filtered, contrast=c("Condition", "WT", "KO"),tidy=TRUE)
colnames(WTvKO_gsea)[1]<-"ensembl_gene_id"
WTvKO_gsea<-inner_join(WTvKO_gsea, ens2gene, by="ensembl_gene_id")
WTvKO_gsea_sum<-WTvKO_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))

rank_posWTvKO<-deframe(WTvKO_gsea_sum)
fgsea_posWTvKO_hallmark<- fgsea(pathways=mouse.hallmark.list, stats=rank_posWTvKO)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.29% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
fgseaRes_posWTvKO_tidy <-fgsea_posWTvKO_hallmark%>%
  as_tibble() %>%
  arrange(desc(NES))

WTvWTIC_gsea<-results(dds.filtered, contrast=c("Condition", "WT", "WTIC"), tidy=TRUE)
colnames(WTvWTIC_gsea)[1]<-"ensembl_gene_id"
WTvWTIC_gsea<-inner_join(WTvWTIC_gsea, ens2gene, by="ensembl_gene_id")
WTvWTIC_gsea_sum<-WTvWTIC_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))

rank_posWTvWTIC<-deframe(WTvWTIC_gsea_sum)
fgsea_posWTvWTIC_hallmark<- fgsea(pathways=mouse.hallmark.list, stats=rank_posWTvWTIC)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.18% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
fgseaRes_posWTvWTIC_tidy <-fgsea_posWTvWTIC_hallmark%>%
  as_tibble() %>%
  arrange(desc(NES))

KOvKOIC_gsea<-results(dds.filtered, contrast=c("Condition","KO", "KOIC"),tidy=TRUE)
colnames(KOvKOIC_gsea)[1]<-"ensembl_gene_id"
KOvKOIC_gsea<-inner_join(KOvKOIC_gsea, ens2gene, by="ensembl_gene_id")
KOvKOIC_gsea_sum<-KOvKOIC_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))

rank_posKOvKOIC<-deframe(KOvKOIC_gsea_sum)
fgsea_posKOvKOIC_hallmark<- fgsea(pathways=mouse.hallmark.list, stats=rank_posKOvKOIC)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.16% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.

## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : For some pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for better estimation.
fgseaRes_posKOvKOIC_tidy <-fgsea_posKOvKOIC_hallmark%>%
  as_tibble() %>%
  arrange(desc(NES))

WTICvKOIC_gsea<-results(dds.filtered, contrast=c("Condition", "WTIC", "KOIC"),tidy=TRUE)
colnames(WTICvKOIC_gsea)[1]<-"ensembl_gene_id"
WTICvKOIC_gsea<-inner_join(WTICvKOIC_gsea, ens2gene, by="ensembl_gene_id")
WTICvKOIC_gsea_sum<-WTICvKOIC_gsea%>%
  dplyr::select(Gene, stat) %>%
  na.omit() %>%
  distinct() %>%
  group_by(Gene) %>%
  summarise(stat=mean(stat))

rank_posWTICvKOIC<-deframe(WTICvKOIC_gsea_sum)
fgsea_posWTICvKOIC_hallmark<- fgsea(pathways=mouse.hallmark.list, stats=rank_posWTICvKOIC)
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0.24% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.

## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : For some pathways, in reality P-values are less than 1e-10. You can set the `eps` argument to zero for better estimation.
fgseaRes_posWTICvKOIC_tidy <-fgsea_posWTICvKOIC_hallmark%>%
  as_tibble() %>%
  arrange(desc(NES))